/**********************************************************************/
/*
   	Author: Karan Makkar, Michelle Han, Adapted from Nikhil Kumar
	Last Updated: June 2024
   	Description: Cleans raw March 2022 SUSENAS data. Outputs deidentified
	cleaned SUSENAS data merged with PMO data.

	This cleaning file should output 2 different datasets:
	1. Cleaned Susenas data:
	sus_mar22_deid_clean
	2. Subset of Susenas person-batch data matched with PMO:
	sus_mar22_deid_clean_merged

*/
/**********************************************************************/


*******************************************
* Setup
* Opens the log, loads the raw individual-level file, attaches the r709 item
* and two household-level files, then lower-cases the R1/R2/R3 variable names.

* include filepaths 
* (the include is skipped when the global master_run equals "1")
  	if "$master_run" !="1" include "./Do/SET_FILEPATHS.do"
	cap log close
* prefix = today's date rendered with the %tdCYND display format; it becomes
* the leading part of the log file name
	global prefix: display %tdCYND td(`c(current_date)')
	log using "$KP_logs/${prefix}_clean_SUSENAS_mar2022.txt", text replace

	use "$KP_deid_susenas/Raw/Individual/SUSENAS_MAR22_deid.dta", clear

* Person-level merge on household id (urut) and r401. assert(3) aborts the
* run unless every observation matches on both sides; only r709 is brought in.
	merge 1:1 urut r401 using "$KP_deid_susenas/Raw/Individual/susenas22mar_ind_prakerja", assert(3) nogen keepusing(r709)

* The household files key on upper-case URUT
	rename urut URUT
	
* Household-level merges: many persons per household, again requiring a
* complete match via assert(3)
	merge m:1 URUT using "$KP_deid_susenas/Raw/Household/Block43-mar2022.dta", assert(3) nogen
	merge m:1 URUT using "$KP_deid_susenas/Raw/Household/mar22_kor22rt rev20221230.dta", assert(3) nogen

*Switch to lowercase
* Each variable matching R1*, R2* or R3* is renamed to its lower-case name.
* If the rename fails (non-zero _rc) the upper-case variable is dropped
* without any message.
* NOTE(review): presumably the rename only fails because the lower-case name
* already exists from an earlier file - confirm that dropping is intended.
foreach var of varlist R1* R2* R3* {
	cap rename `var' `=strlower("`var'")'
	if _rc !=0 drop `var'
}

*******************************************
* Recode yes/no questions 
* Every listed item must arrive holding only 1, 5 or system missing (the
* assert enforces this). Each leaves this block as a 0/1 indicator carrying
* the yesno value label: 1 stays 1, 5 becomes 0, and missing becomes 0.

	la def yesno 0 "No" 1 "Yes"

* The list ends without a line-continuation marker on its last entry, so the
* macro definition no longer depends on a blank line following it.
	local yesno_vars r2001* ///
					 r2201*2 ///
					 r2203 ///
					 r2205aa ///
					 r2205ba ///
					 r2206aa ///
					 r2207  ///
					 r2208a* ///
					 r2209* ///
					 r2210aa ///
					 r2210ba ///
					 r2211a ///
					 r2211b ///
					 r709 ///
					 r617 ///
					 r609

	foreach var of varlist `yesno_vars' {
		* Stop immediately if the raw coding is not the expected 1/5/missing
		assert `var' == 1 | `var' == 5 | `var' == .
		recode `var' (1=1) (5=0)
		* NOTE(review): after this line a skipped question cannot be told
		* apart from an explicit "No"
		replace `var' = 0 if mi(`var') // for questions dependent on previous skip logic
		la val `var' yesno
	}

*******************************************
* Demographics
* Derives person-level demographic variables from the raw r4xx / r6xx items
* and a few household / location items. Raw items are copied into named
* variables; indicators are built with logical expressions, which evaluate
* to 0/1.

* Age
* age_cat groups age into five 20-year bands coded 1-5 (0-20, 21-40, 41-60,
* 61-80, 81-100); values outside 0-100 are left unchanged by recode.
	gen age_sus = r407
	recode age_sus (0/20 = 1) (21/40 = 2) (41/60 = 3) (61/80 = 4) (81/100 = 5), gen(age_cat)

* Year DOB
    gen year_dob_sus = r406c

* Month DOB
* Code 98 is set to missing (presumably the "unknown" code - confirm against
* the questionnaire)
    gen month_dob_sus = r406b
	replace month_dob_sus = . if month_dob_sus == 98

* Day DOB	
* Code 98 is set to missing, as for the month
	gen day_dob_sus = r406a
	replace day_dob_sus = . if day_dob_sus == 98
	
* Dummy for age <=30	
* NOTE(review): young starts at 0 and is only switched to 1 when
* age_sus <= 30, so a missing age ends up coded 0 ("Over 30 Years Old").
	gen young = 0 
	replace young = 1 if age_sus <= 30
	la def young 0 "Over 30 Years Old" 1 "30 and Under"
	la val young young 

* Marital status
	gen marital_status = r404

* Age First Married
	gen age_married = r409

* Gender
* gender is 1 when r405 == 1 (labelled Male); female is 1 when r405 == 2.
* Any other value of r405, including missing, gives 0 in both.
	gen gender = r405 == 1
	la def gender 1 "Male" 0 "Female"
	la val gender gender

	gen female = r405 == 2
	la def female 1 "Female" 0 "Male"
	la val female female

* Education (translated to years of schooling)
* school_years stays missing for any educ code not listed below
	gen educ = r612
	gen school_years = .
	replace school_years = 6 if inlist(educ, 1, 2, 3, 4) // Paket A, SDLB, SD, MI
	replace school_years = 9 if inlist(educ, 5, 6, 7, 8, 9, 10) // Paket B, SMP LB, SMP, MTs
	replace school_years = 12 if inlist(educ, 11, 12, 13, 14, 15, 16, 17) // Paket C, SMLB, SMA, MA, SMK, MAK
	replace school_years = 13.5 if educ == 18 // Diploma I/II
	replace school_years = 15 if educ == 19 // Diploma III
	replace school_years = 16 if inlist(educ, 20, 21) // Diploma IV / SI
	replace school_years = 18 if inlist(educ, 22, 23) // SII/Profesi
	replace school_years = 21 if educ == 24 // SIII
	replace school_years = 3 if educ == 25

* Dummy for educated (graduating high school)
* NOTE(review): two problems with the expression below, left unchanged
* because the exported data signature is pinned to the current values:
*   1. Stata treats missing as larger than any number, so observations with
*      missing school_years are coded educated = 1.
*   2. The test is strictly greater than 12, so exactly 12 years (the high
*      school codes above) is coded 0, while the value label for 1 reads
*      "12 or More Years of Schooling".
* A corrected form would be: school_years >= 12 if !mi(school_years).
* Applying it changes the data, so the pinned signatures in the export
* sections must be regenerated at the same time.
	gen educated = school_years > 12 
	la def educated 0 "Less than 12 Years Schooling" 1 "12 or More Years of Schooling"
	la val educated educated

* Literacy Dummy
* r609 has already been recoded to 0/1 by the yes/no section of this file
	gen literate_latin = r609 ==1

* Number of HH members
	gen hh_size_sus = r301

* Num Children
* child = 1 when age_sus < 18; num_child is the household (URUT) total
	gen child = age_sus < 18
	bysort URUT: egen num_child = total(child)

* City (kotamadya) dummy
* kode_kab copies r102; urban_sus is 1 when r105 == 1
	gen kode_kab = r102
	gen urban_sus = r105 == 1

* Do you (applicant) live in same kabupaten you were born in?
* NOTE(review): the equality is also true when both codes are missing, and
* false when only one is missing - confirm this is the intended treatment.
	gen born_kab = r602
	destring born_kab, replace
	gen born_live_same = born_kab == kode_kab

* Kabupaten 5 years ago
* city_sus_5 is 1 when the kabupaten code lies in 70-79
* (presumably the range used for cities - confirm against the code list)
	gen last_kab = r604
	destring last_kab, replace
	gen city_sus_5 = inrange(last_kab, 70, 79)

* Java 5 years ago
* java_sus_5 is 1 when the province code lies in 31-36
	gen last_prov = r603
	destring last_prov, replace
	gen java_sus_5 = inrange(last_prov, 31, 36)

* Relation to Head of Household
    gen relation_hh_head = r403

* HH Survey Taker Dummy
* 1 when r410 equals this person's r401
	gen surveytaker = r410 == r401
*******************************************
* Disability

* One pass per functional domain. Each triple is: variable stem, raw item
* number, first answer code that counts as a difficulty. The three codes
* starting at that value set <stem>_disabled to 1; the first two of them
* also feed severe_disability. <stem>_disability keeps the raw answer.
	local any_cond 0
	local severe_cond 0
	foreach spec in "vision 1002 1" "hearing 1003 5" "walk 1004 1" "hand 1005 5" ///
					"att 1006 1" "emo 1007 5" "speech 1008 1" {
		tokenize `spec'
		local dom `1'
		local item r`2'
		local c1 = `3'
		local c2 = `3' + 1
		local c3 = `3' + 2

		tab `item', m
		gen `dom'_disabled = inlist(`item', `c1', `c2', `c3')
		gen `dom'_disability = `item'

		* Accumulate the OR-conditions used after the loop
		local any_cond `any_cond' | `dom'_disabled
		local severe_cond `severe_cond' | inlist(`item', `c1', `c2')
	}

	summ *disabled

* At least one of the seven domain indicators equals 1
	gen any_disability = (`any_cond')
	tab any_disability, m

* At least one domain answered with one of its first two difficulty codes
	gen severe_disability = (`severe_cond')
	tab severe_disability


*******************************************
* Consumption

* Food and non-food consumption divided by r301, each with:
*   cons_*      raw ratio
*   lcon_*      log of the raw ratio
*   cons_adj_*  ratio with values above the 99th / below the 1st percentile
*               set to that percentile (percentiles taken over positive
*               values only), then expressed in thousands
*   lcon_adj_*  log of the adjusted value
	foreach grp in food nfood {
		local src  = cond("`grp'" == "food", "FOOD", "NONFOOD")
		local desc = cond("`grp'" == "food", "Food", "Non-Food")

		gen cons_`grp' = `src'/r301
		gen lcon_`grp' = log(cons_`grp')

		gen cons_adj_`grp' = cons_`grp'
		* r(p1) and r(p99) from this summarize are used by the two replaces
		summarize cons_adj_`grp' if cons_adj_`grp' > 0, detail
		replace cons_adj_`grp' = r(p99) if !mi(cons_adj_`grp') & cons_adj_`grp' > r(p99)
		replace cons_adj_`grp' = r(p1) if !mi(cons_adj_`grp') & cons_adj_`grp' < r(p1)
		replace cons_adj_`grp' = cons_adj_`grp' / 1000
		label variable cons_adj_`grp' "Adjusted `desc' Consumption (in 1000s of rupiah, trimmed of 1st and 99th pct)"
		summarize cons_adj_`grp', detail
		gen lcon_adj_`grp' = log(cons_adj_`grp')
	}

* Total consumption = food + non-food (unadjusted), then the same treatment
	gen tot_cons = cons_food + cons_nfood
	gen ltot_cons = log(tot_cons)

	gen tot_adj_cons = tot_cons
	summarize tot_adj_cons if tot_adj_cons > 0, detail
	replace tot_adj_cons = r(p99) if !mi(tot_adj_cons) & tot_adj_cons > r(p99)
	replace tot_adj_cons = r(p1) if !mi(tot_adj_cons) & tot_adj_cons < r(p1)
	replace tot_adj_cons = tot_adj_cons / 1000
	label variable tot_adj_cons "Adjusted Total Consumption (in 1000s of rupiah, trimmed of 1st and 99th pct)"
	summarize tot_adj_cons, detail
	gen ltot_adj_cons = log(tot_adj_cons)

* Durable goods 
* One dur_<asset> variable per item r2001a-r2001m, created in item order.
* The two lists below are parallel (word k of one pairs with word k of the
* other):
*   a canister   5.5 kilos or more LPG canister
*   b fridge     fridge/freezer
*   c aircon     air conditioner
*   d waterheat  water heater
*   e phone      telephone
*   f computer   computer
*   g gold       gold
*   h bike       motorcycle
*   i boat       boat
*   j motorboat  motorboat
*   k car        car
*   l tv         flat television
*   m land       land
	local assets  canister fridge aircon waterheat phone computer gold bike boat motorboat car tv land
	local letters a b c d e f g h i j k l m
	local n_assets : word count `assets'

	forvalues k = 1/`n_assets' {
		local asset  : word `k' of `assets'
		local letter : word `k' of `letters'
		gen dur_`asset' = r2001`letter'
	}

* Check
* Tabulate every dur_ variable (missing included) and attach the yesno label
	foreach v of varlist dur_* {
		tabulate `v', missing
		label values `v' yesno
	}



*******************************************
* Employment

* Work status (r706) and one 0/1 indicator per status group
	gen employment_status = r706
	gen self_employed = (r706 == 1)
	gen business_owner = inlist(r706, 2, 3)
	gen perm_worker = (r706 == 4)
	gen freelancer = (r706 == 5)
	gen family_unpaid = (r706 == 6)
	gen self_emp_bus_owner = inlist(r706, 1, 2, 3)

* employed covers status codes 1-5; code 6 and missing status give 0
	gen employed = inlist(employment_status, 1, 2, 3, 4, 5)
	*gen employed = r702_a == "A"
	label define emp ///
			1 "Self-employed" ///
			2 "Business with temp emp" ///
			3 "Business with perm emp" ///
			4 "Permanent Worker" ///
			5 "Freelancer" ///
			6 "Family/unpaid worker", replace
	label values employment_status emp
	tabulate employed employment_status, missing row

* Hours worked (main job r707, all jobs r708); forced to 0 when not employed
	gen hrs_worked_main = cond(employed == 0, 0, r707)
	gen hrs_worked_all = cond(employed == 0, 0, r708)

* Household total of hours worked in all jobs
	bys URUT: egen hh_hrs_worked_all = total(hrs_worked_all)

* --in the last week, did you work or have work or a business but temporarily not working?  (same as SAKARNAS 9A + 10A --can also stack and add a dummy)
* r704 answer 5 becomes 0, and anyone flagged employed is also set to 0
	gen temp_not_work = r704
	replace temp_not_work = 0 if temp_not_work == 5 | employed == 1
	tabulate temp_not_work, missing
	tabulate temp_not_work employment_status, missing
	tabulate temp_not_work employed, missing

* Household business owners
* Household-level maxima: 1 if any member has the indicator set
	bys URUT: egen hh_business = max(business_owner)
	bys URUT: egen hh_emp_self = max(self_employed)

* Industry code--26 industries (stack with Sakarnas industries potentially)
	rename r705 ind_code

* Does anyone in the household own a business with employees (either formal or informal)
	gen with_emp = (r706 == 2 | r706 == 3)
	bys URUT: egen with_emp_HH_S = max(with_emp)


*******************************************
* Telecom

* Overall, use internet
	gen use_internet = (r808 == 1)

* In the last 3 months, accessed internet using PC/laptop/tablet
* (any of the r809 a, b or c boxes holds its own letter)
	gen use_internet_comp = (r809_a == "A") | (r809_b == "B") | (r809_c == "C")

* In the last 3 months, accessed internet using cellphone
	gen use_internet_phone = (r809_d == "D")

* In the last 3 months, accessed internet for learning (r811 box b),
* shopping (box e) and banking (box h). Each pair is: variable suffix,
* box letter; a box counts as ticked when it holds its upper-case letter.
	foreach spec in "learn b" "shop e" "bank h" {
		tokenize `spec'
		gen use_internet_`1' = (r811_`2' == strupper("`2'"))
	}


*******************************************
* Social Assistance
* Copies the assistance items into named variables and tabulates them. Items
* that appear in the yes/no recode list earlier in this file (r617, r2201x2,
* r2203, r2205aa, r2207, r2208a*, r2209*, r2211a, r2211b, r709) are already
* 0/1 here; the others are compared against their raw codes.

* 617: receive PIP 
	gen receive_PIP = r617
	tab receive_PIP

* 2201: social security 
	gen assist_vet = r2201a2 
	gen assist_pen = r2201b2 
	gen assist_workinsur = r2201c2
	gen assist_lifeinsur = r2201d2
	gen assist_sevpay = r2201e2 

* 2202: Family Card Sejahtera (KKS)
* 1 when r2202 is 1 or 2; any other value, including missing, gives 0
	gen receive_KKS = (r2202 == 1 | r2202 == 2)
	la val receive_KKS yesno
	tab receive_KKS

* 2203:  PKH last year
	gen receive_pkh_year = r2203
	tab receive_pkh_year

* 2204a:  PKH Now
* r2204a is not in the yes/no recode list, so it is compared with raw code 1
	gen receive_pkh_now = r2204a ==1
	tab receive_pkh_now

* 2204b:  PKH Receipt Method
	gen pkh_receipt_method = r2204b

* 2205: Assistance for elderly (not from PKH) 
	gen assist_elderly = r2205aa 
	tab assist_elderly

/* 2206 Assistance for people with disabilities (not from PKH)
	gen assist_disable = r2206aa
	tab assist_disable*/

* 2207: receive BPNT 
	gen ever_BPNT = r2207
	tab ever_BPNT

* 2208: receive BPNT by month 
* Items r2208a2-r2208a5 are mapped to Feb 2022, Jan 2022, Dec 2021, Nov 2021
	gen feb22_BPNT = r2208a2
	tab feb22_BPNT

	gen jan22_BPNT = r2208a3
	tab jan22_BPNT

	gen dec21_BPNT = r2208a4
	tab dec21_BPNT

	gen nov21_BPNT = r2208a5
	tab nov21_BPNT

* 2209B: receive BLT-DD 
	gen receive_BLTDD = r2209b
	tab receive_BLTDD

* Individual receives any Kartu Prakerja
	gen get_pk = r709 == 1
	tab get_pk

* 2211a: routine assistance from local governments
	gen r_assist = r2211a 
	tab r_assist 

* 2211b: non-routine assistance from local governments
	gen nr_assist = r2211b
	tab nr_assist

* Amount of non-routine assistance: 0 when none was received, then divided
* by 1000
	gen nr_assist_amt = r2211b1 
	replace nr_assist_amt = 0 if nr_assist == 0
	replace nr_assist_amt = nr_assist_amt/1000

* 1 when the raw amount is exactly 600,000 / 1,200,000 / 1,800,000 /
* 2,400,000
* NOTE(review): presumably these are Kartu Prakerja incentive amounts, given
* the kp suffix - confirm
	gen nr_assist_kp = inlist(r2211b1, 600000, 1200000, 1800000, 2400000)
	tab nr_assist_kp

* Dropping sampling unit strata var
	drop strata

*******************************************
* House Characteristics
* Each raw housing item is copied as is. The derived 0/1 flags are left
* missing wherever the raw item is missing.

* Floor (r1808): flag for code 8
	gen house_floor = r1808
	gen dirt_floor = cond(mi(r1808), ., r1808 == 8)

* Roof (r1806): flag for code 5
	gen house_roof = r1806
	gen bamboo_roof = cond(mi(r1806), ., r1806 == 5)

* Floor area (r1804)
	gen house_area = r1804

* Toilet (r1809a): flag for values between 1 and 2
	gen house_toilet = r1809a
	gen toilet_inhouse = cond(mi(r1809a), ., inrange(r1809a, 1, 2))

* PLN electricity (r1816): flag for codes 1 and 2
	gen house_pln_elec = r1816
	gen has_pln_elec = cond(mi(r1816), ., inlist(r1816, 1, 2))

/*----------------------------------------------------*/
              * Export cleaned data
/*----------------------------------------------------*/

* Renames the weight variable, then saves the cleaned file only when the
* data signature equals the pinned value. On a mismatch both signatures are
* printed and the do-file aborts with return code 459 through the built-in
* error command, so the failure is reported as a data-consistency error
* rather than as an unrecognized command.
  rename fwt weight
  datasignature
  local sig_found "`r(datasignature)'"
  local sig_expected "1237946:647(43664):3426778012:499227228"
  if "`sig_found'" == "`sig_expected'" {
    save "$KP_deid_susenas/Clean/sus_mar22_deid_clean.dta", replace
  }
  else {
    di as err "Careful, your machine produces a different dataset"
    di as err "  expected signature: `sig_expected'"
    di as err "  obtained signature: `sig_found'"
    error 459
  }
  

/*----------------------------------------------------*/
			* Merge with admin data and export
/*----------------------------------------------------*/

* Keep only persons that carry an anon_id4, strip the raw survey items, and
* reduce to one row per anon_id4 before matching to the PMO long file.
	gen sus_round = 8
	keep if !mi(anon_id4)
* r101 is renamed out of the way so the wildcard drop below does not remove
* it, then renamed back. The drop also removes date_incentive from the
* survey side; the variable of that name used further down comes in with
* the merge.
	rename r101 m101
	drop r1* r2* r3* r4* r5* r6* r7* r8* r9* date_incentive
	rename m101 r101

* check duplicates
	duplicates tag anon_id4, gen(anon_id4_dupe)
	tab anon_id4_dupe

* drop duplicates with more missing data
* missing = number of missing values across all variables in the row. After
* sorting, the first row within each anon_id4 has the fewest missing values
* and is the one kept.
* NOTE(review): rows of the same anon_id4 with an equal missing count are
* not ordered by anything in this code, so which of them is kept is not
* determined here.
	egen missing = rowmiss(*)
	sort anon_id4 missing
	bysort anon_id4: gen n = _n
	drop if n != 1 & !missing(anon_id4)
	drop n

* One survey row to many PMO rows; keep(3) retains matched rows only
	merge 1:m anon_id4 using "$KP_deid_admin/Clean/pmo_b1-22_clean_long_deid.dta", nogen keep(3)

	rename urban urban_pmo 
	la var urban_pmo "Urban (PMO Data, based on city ID)"

	rename urban_sus urban 
	la var urban "Urban (SUSENAS survey data)"

* generate variable to indicate date_incentive falls between 01sep2021 and
* 01mar2022 inclusive (a six-month window ending at the survey date)
* NOTE(review): this comment previously said "within 4 months of survey
* date", which does not match the dates coded below - confirm which is
* intended.
	gen sus_end_date = date("01mar2022","DMY")
	format sus_end_date %td 

	gen sus_start_date = date("01sep2021","DMY")
	format sus_start_date %td

* inrange gives 0 when date_incentive is missing
	gen incentive_near_sus = inrange(date_incentive,sus_start_date,sus_end_date)
	tab incentive_near_sus 

* Remove the helper variables created in this section
	drop *_dupe missing sus_start_date sus_end_date

	* Compress storage, then save the merged file only when the data
	* signature equals the pinned value. On a mismatch both signatures are
	* printed and the do-file aborts with return code 459 through the
	* built-in error command, so the failure is reported as a
	* data-consistency error rather than as an unrecognized command.
	compress
	datasignature
	local sig_found "`r(datasignature)'"
	local sig_expected "203672:225(78697):904927901:3059941469"
	if "`sig_found'" == "`sig_expected'" {
		save "$KP_deid_susenas/Clean/sus_mar22_deid_clean_merged.dta", replace
	}
	else {
		di as err "Careful, your machine produces a different dataset"
		di as err "  expected signature: `sig_expected'"
		di as err "  obtained signature: `sig_found'"
		error 459
	}
	
